import numpy as np
import pandas as pd
from datetime import datetime
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'notebook'
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
spark = SparkSession \
.builder \
.master("local[*]") \
.appName('SPL_Checkouts_Milestone') \
.getOrCreate()
sc = spark.sparkContext
df = spark.read.csv("2010-20_spl_physical_checkouts.csv", header=True, inferSchema= True)
spl = df.withColumn('CheckoutMonth', regexp_extract('CheckoutDateTime', r"([0-9\-]+)", 1))
spl = spl.withColumn('CheckoutDate', substring('CheckoutDateTime', 4, 2))
spl = spl.withColumn("Subjects", split(col("Subjects"), " "))
spl.show(1)
+--------------------+------------+---------+-----------+--------+----------+-------------------+-------------+----------------+--------------------+-------------+------------+ | ID|CheckoutYear|BibNumber|ItemBarcode|ItemType|Collection| CallNumber| ItemTitle| Subjects| CheckoutDateTime|CheckoutMonth|CheckoutDate| +--------------------+------------+---------+-----------+--------+----------+-------------------+-------------+----------------+--------------------+-------------+------------+ |20100128110900001...| 2010| 1287425|10027511244| acbk| nanf|641.822 WILLAN 1992|Perfect pasta|[Cookery, Pasta]|01/28/2010 11:09:...| 01| 28| +--------------------+------------+---------+-----------+--------+----------+-------------------+-------------+----------------+--------------------+-------------+------------+ only showing top 1 row
spl.createOrReplaceTempView('spl')
spl_view = spark.sql("SELECT CheckoutDate, CheckoutMonth, CheckoutYear, Collection, ItemType, ItemTitle, Subjects FROM spl")
spl_view.createOrReplaceTempView('spl_view')
spl_view.columns
['CheckoutDate', 'CheckoutMonth', 'CheckoutYear', 'Collection', 'ItemType', 'ItemTitle', 'Subjects']
ils = pd.read_csv("Integrated_Library_System__ILS__Data_Dictionary.csv")
ils.set_index("Code", inplace=True)
ils.head()
| Description | Code Type | Format Group | Format Subgroup | Category Group | Category Subgroup | Age Group | |
|---|---|---|---|---|---|---|---|
| Code | |||||||
| cazover | CA7-zine collection oversize | ItemCollection | Book | Periodical | NaN | Adult | |
| caziner | CA7-zine collection reference | ItemCollection | Book | Periodical | NaN | Adult | |
| cazval | CA7-zine collection valuable mat. | ItemCollection | Book | Periodical | NaN | Adult | |
| nga | Northgate Branch | Location | NaN | NaN | NaN | NaN | NaN |
| hip | High Point Branch | Location | NaN | NaN | NaN | NaN | NaN |
spl_yearly = spark.sql("""
SELECT CheckoutYear, ItemType, COUNT(*) as NumCheckouts
FROM spl_view
GROUP BY CheckoutYear, ItemType
ORDER By NumCheckouts DESC;
""")
spl_yearly_df = spl_yearly.toPandas()
spl_yearly_df.head()
| CheckoutYear | ItemType | NumCheckouts | |
|---|---|---|---|
| 0 | 2010 | acbk | 2649827 |
| 1 | 2011 | acbk | 2486120 |
| 2 | 2013 | acbk | 2485961 |
| 3 | 2010 | acdvd | 2396572 |
| 4 | 2012 | acbk | 2319616 |
spl_yearly_df['Item Type'] = spl_yearly_df['ItemType'].apply(lambda t: "Other" if t==np.nan else ils.loc[t]['Description'])
spl_yearly_df['Age Group'] = spl_yearly_df['ItemType'].apply(lambda t: "Other" if t==np.nan else ils.loc[t]['Age Group'])
spl_yearly_df['Format'] = spl_yearly_df['ItemType'].apply(lambda t: "Other" if t==np.nan else ils.loc[t]['Format Subgroup'])
spl_yearly_df['FormatType'] = spl_yearly_df['ItemType'].apply(lambda t: "Other" if t==np.nan else ils.loc[t]['Format Group'])
spl_yearly_df['Category'] = spl_yearly_df['ItemType'].apply(lambda t: "Other" if t==np.nan else ils.loc[t]['Category Group'])
spl_yearly_df.rename({"NumCheckouts": "Number of Checkouts"}, axis=1, inplace=True)
spl_yearly_df.head()
| CheckoutYear | ItemType | Number of Checkouts | Item Type | Age Group | Format | FormatType | Category | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2010 | acbk | 2649827 | Book: Adult/YA | Adult | Book | Miscellaneous | |
| 1 | 2011 | acbk | 2486120 | Book: Adult/YA | Adult | Book | Miscellaneous | |
| 2 | 2013 | acbk | 2485961 | Book: Adult/YA | Adult | Book | Miscellaneous | |
| 3 | 2010 | acdvd | 2396572 | DVD: Adult/YA | Adult | Video Disc | Media | Miscellaneous |
| 4 | 2012 | acbk | 2319616 | Book: Adult/YA | Adult | Book | Miscellaneous |
fig1 = px.bar(spl_yearly_df.fillna("Other"),
x="CheckoutYear",
y="Number of Checkouts",
color="Format",
color_discrete_sequence=["forestgreen", "goldenrod", "tomato", "turquoise",
"indigo", "black", "blue", "orange",
"brown", "black", "teal", "grey", "orchid"],
facet_col="Age Group",
height=750, width=1000,
title="Seattle Public Library Physical Checkout Format (2010-2020)")
fig1.show()
fig2 = px.bar(spl_yearly_df,
x="CheckoutYear",
y="Number of Checkouts",
color="Age Group",
color_discrete_sequence=["goldenrod", "teal"],
height=500, width=750,
title="Seattle Public Library Physical Checkouts (2010-2020)")
fig2.show()
spl_yearly_equiptment_df = spl_yearly_df[spl_yearly_df.FormatType == 'Equipment']
spl_yearly_equiptment_df.head()
| CheckoutYear | ItemType | Number of Checkouts | Item Type | Age Group | Format | FormatType | Category | |
|---|---|---|---|---|---|---|---|---|
| 75 | 2018 | alaptop | 14375 | Laptop: Hourly Circulating | Adult | NaN | Equipment | Miscellaneous |
| 86 | 2019 | alaptop | 9809 | Laptop: Hourly Circulating | Adult | NaN | Equipment | Miscellaneous |
| 92 | 2018 | aceq | 7831 | Equipment: Adult/YA | Adult | NaN | Equipment | Miscellaneous |
| 93 | 2019 | aceq | 7746 | Equipment: Adult/YA | Adult | NaN | Equipment | Miscellaneous |
| 94 | 2016 | aceq | 7066 | Equipment: Adult/YA | Adult | NaN | Equipment | Miscellaneous |
fig3 = px.bar(spl_yearly_equiptment_df,
y="CheckoutYear",
x="Number of Checkouts",
color="Item Type",
color_discrete_sequence=["goldenrod", "olive",
"indigo", "brown", "teal", "grey", "red"],
facet_col="Age Group",
orientation='h',
height=600,
barmode="stack",
title="Seattle Public Library Physical Equipment Checkout (2010-2020)")
fig3.show()
spl_collections_yearly = spark.sql("""
SELECT CheckoutYear, CheckoutMonth, Collection, COUNT(*) as NumCheckouts
FROM spl_view
GROUP BY CheckoutYear, CheckoutMonth, Collection
ORDER By NumCheckouts DESC;
""")
spl_collections_yearly = spl_collections_yearly.toPandas()
spl_collections_yearly.head()
| CheckoutYear | CheckoutMonth | Collection | NumCheckouts | |
|---|---|---|---|---|
| 0 | 2010 | 03 | nadvd | 170502 |
| 1 | 2010 | 01 | nadvd | 169509 |
| 2 | 2010 | 06 | nadvd | 155156 |
| 3 | 2010 | 04 | nadvd | 153769 |
| 4 | 2010 | 05 | nadvd | 149921 |
spl_collections_yearly['Collection Type'] = spl_collections_yearly['Collection'].apply(lambda t: "Other" if t==np.nan else str(ils.loc[t]['Description']))
spl_collections_yearly = spl_collections_yearly.rename({"NumCheckouts": "Number of Checkouts"}, axis=1).dropna()
spl_collections_yearly.head()
| CheckoutYear | CheckoutMonth | Collection | Number of Checkouts | Type | Collection Type | |
|---|---|---|---|---|---|---|
| 0 | 2010 | 03 | nadvd | 170502 | NA-DVD, Fiction | NA-DVD, Fiction |
| 1 | 2010 | 01 | nadvd | 169509 | NA-DVD, Fiction | NA-DVD, Fiction |
| 2 | 2010 | 06 | nadvd | 155156 | NA-DVD, Fiction | NA-DVD, Fiction |
| 3 | 2010 | 04 | nadvd | 153769 | NA-DVD, Fiction | NA-DVD, Fiction |
| 4 | 2010 | 05 | nadvd | 149921 | NA-DVD, Fiction | NA-DVD, Fiction |
fig4 = px.bar(spl_collections_yearly,
x="CheckoutYear",
y="Number of Checkouts",
color="Collection Type",
color_discrete_sequence=px.colors.qualitative.G10,
# color_discrete_sequence=["goldenrod", "olive",
# "indigo", "brown", "teal", "grey", "red"],
height=1000, width=2500,
title="Seattle Public Library Collection Circulation By Year (2010-2020)")
fig4.show()
spl_collections_monthly = spark.sql("""
SELECT CheckoutMonth, Collection, COUNT(Collection) as NumCheckouts
FROM spl_view
GROUP BY CheckoutMonth, Collection
ORDER By NumCheckouts DESC;
""")
spl_collections_monthly = spl_collections_monthly.toPandas()
spl_collections_monthly.head()
| CheckoutMonth | Collection | NumCheckouts | |
|---|---|---|---|
| 0 | 01 | nadvd | 1129150 |
| 1 | 03 | nadvd | 1097255 |
| 2 | 02 | nadvd | 1027768 |
| 3 | 04 | nadvd | 1003426 |
| 4 | 07 | nadvd | 980306 |
spl_collections_monthly['Collection Type'] = spl_collections_monthly['Collection'].apply(lambda t: "Other" if t==np.nan else str(ils.loc[t]['Description']))
spl_collections_monthly = spl_collections_monthly.rename({"NumCheckouts": "Number of Checkouts"}, axis=1).dropna()
spl_collections_monthly['CheckoutMonth'] = spl_collections_monthly['CheckoutMonth'].apply(lambda t: int(t))
spl_collections_monthly.head()
| CheckoutMonth | Collection | Number of Checkouts | Collection Type | |
|---|---|---|---|---|
| 0 | 1 | nadvd | 1129150 | NA-DVD, Fiction |
| 1 | 3 | nadvd | 1097255 | NA-DVD, Fiction |
| 2 | 2 | nadvd | 1027768 | NA-DVD, Fiction |
| 3 | 4 | nadvd | 1003426 | NA-DVD, Fiction |
| 4 | 7 | nadvd | 980306 | NA-DVD, Fiction |
fig5 = px.bar(spl_collections_monthly,
x="CheckoutMonth",
y="Number of Checkouts",
color="Collection Type",
color_discrete_sequence=px.colors.qualitative.G10,
height=1000, width=2500,
title="Seattle Public Library Collection Circulation By Month (2010-2020)")
fig5.show()